import pandas as pd
import numpy as np
df_train = pd.read_csv("train_realestate.csv")
df_test= pd.read_csv("test_realestate.csv")
df_test.head()
| UID | BLOCKID | SUMLEVEL | COUNTYID | STATEID | state | state_ab | city | place | type | ... | female_age_mean | female_age_median | female_age_stdev | female_age_sample_weight | female_age_samples | pct_own | married | married_snp | separated | divorced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 255504 | NaN | 140 | 163 | 26 | Michigan | MI | Detroit | Dearborn Heights City | CDP | ... | 34.78682 | 33.75000 | 21.58531 | 416.48097 | 1938.0 | 0.70252 | 0.28217 | 0.05910 | 0.03813 | 0.14299 |
| 1 | 252676 | NaN | 140 | 1 | 23 | Maine | ME | Auburn | Auburn City | City | ... | 44.23451 | 46.66667 | 22.37036 | 532.03505 | 1950.0 | 0.85128 | 0.64221 | 0.02338 | 0.00000 | 0.13377 |
| 2 | 276314 | NaN | 140 | 15 | 42 | Pennsylvania | PA | Pine City | Millerton | Borough | ... | 41.62426 | 44.50000 | 22.86213 | 453.11959 | 1879.0 | 0.81897 | 0.59961 | 0.01746 | 0.01358 | 0.10026 |
| 3 | 248614 | NaN | 140 | 231 | 21 | Kentucky | KY | Monticello | Monticello City | City | ... | 44.81200 | 48.00000 | 21.03155 | 263.94320 | 1081.0 | 0.84609 | 0.56953 | 0.05492 | 0.04694 | 0.12489 |
| 4 | 286865 | NaN | 140 | 355 | 48 | Texas | TX | Corpus Christi | Edroy | Town | ... | 40.66618 | 42.66667 | 21.30900 | 709.90829 | 2956.0 | 0.79077 | 0.57620 | 0.01726 | 0.00588 | 0.16379 |
5 rows × 80 columns
df_train.head()
| UID | BLOCKID | SUMLEVEL | COUNTYID | STATEID | state | state_ab | city | place | type | ... | female_age_mean | female_age_median | female_age_stdev | female_age_sample_weight | female_age_samples | pct_own | married | married_snp | separated | divorced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 267822 | NaN | 140 | 53 | 36 | New York | NY | Hamilton | Hamilton | City | ... | 44.48629 | 45.33333 | 22.51276 | 685.33845 | 2618.0 | 0.79046 | 0.57851 | 0.01882 | 0.01240 | 0.08770 |
| 1 | 246444 | NaN | 140 | 141 | 18 | Indiana | IN | South Bend | Roseland | City | ... | 36.48391 | 37.58333 | 23.43353 | 267.23367 | 1284.0 | 0.52483 | 0.34886 | 0.01426 | 0.01426 | 0.09030 |
| 2 | 245683 | NaN | 140 | 63 | 18 | Indiana | IN | Danville | Danville | City | ... | 42.15810 | 42.83333 | 23.94119 | 707.01963 | 3238.0 | 0.85331 | 0.64745 | 0.02830 | 0.01607 | 0.10657 |
| 3 | 279653 | NaN | 140 | 127 | 72 | Puerto Rico | PR | San Juan | Guaynabo | Urban | ... | 47.77526 | 50.58333 | 24.32015 | 362.20193 | 1559.0 | 0.65037 | 0.47257 | 0.02021 | 0.02021 | 0.10106 |
| 4 | 247218 | NaN | 140 | 161 | 20 | Kansas | KS | Manhattan | Manhattan City | City | ... | 24.17693 | 21.58333 | 11.10484 | 1854.48652 | 3051.0 | 0.13046 | 0.12356 | 0.00000 | 0.00000 | 0.03109 |
5 rows × 80 columns
print(df_test.shape, df_train.shape)
(11709, 80) (27321, 80)
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 27321 entries, 0 to 27320 Data columns (total 80 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 UID 27321 non-null int64 1 BLOCKID 0 non-null float64 2 SUMLEVEL 27321 non-null int64 3 COUNTYID 27321 non-null int64 4 STATEID 27321 non-null int64 5 state 27321 non-null object 6 state_ab 27321 non-null object 7 city 27321 non-null object 8 place 27321 non-null object 9 type 27321 non-null object 10 primary 27321 non-null object 11 zip_code 27321 non-null int64 12 area_code 27321 non-null int64 13 lat 27321 non-null float64 14 lng 27321 non-null float64 15 ALand 27321 non-null float64 16 AWater 27321 non-null int64 17 pop 27321 non-null int64 18 male_pop 27321 non-null int64 19 female_pop 27321 non-null int64 20 rent_mean 27007 non-null float64 21 rent_median 27007 non-null float64 22 rent_stdev 27007 non-null float64 23 rent_sample_weight 27007 non-null float64 24 rent_samples 27007 non-null float64 25 rent_gt_10 27007 non-null float64 26 rent_gt_15 27007 non-null float64 27 rent_gt_20 27007 non-null float64 28 rent_gt_25 27007 non-null float64 29 rent_gt_30 27007 non-null float64 30 rent_gt_35 27007 non-null float64 31 rent_gt_40 27007 non-null float64 32 rent_gt_50 27007 non-null float64 33 universe_samples 27321 non-null int64 34 used_samples 27321 non-null int64 35 hi_mean 27053 non-null float64 36 hi_median 27053 non-null float64 37 hi_stdev 27053 non-null float64 38 hi_sample_weight 27053 non-null float64 39 hi_samples 27053 non-null float64 40 family_mean 27023 non-null float64 41 family_median 27023 non-null float64 42 family_stdev 27023 non-null float64 43 family_sample_weight 27023 non-null float64 44 family_samples 27023 non-null float64 45 hc_mortgage_mean 26748 non-null float64 46 hc_mortgage_median 26748 non-null float64 47 hc_mortgage_stdev 26748 non-null float64 48 hc_mortgage_sample_weight 26748 non-null float64 49 hc_mortgage_samples 26748 non-null float64 50 hc_mean 26721 non-null float64 51 hc_median 26721 non-null float64 52 hc_stdev 26721 non-null float64 53 hc_samples 26721 non-null float64 54 hc_sample_weight 26721 non-null float64 55 home_equity_second_mortgage 26864 non-null float64 56 second_mortgage 26864 non-null float64 57 home_equity 26864 non-null float64 58 debt 26864 non-null float64 59 second_mortgage_cdf 26864 non-null float64 60 home_equity_cdf 26864 non-null float64 61 debt_cdf 26864 non-null float64 62 hs_degree 27131 non-null float64 63 hs_degree_male 27121 non-null float64 64 hs_degree_female 27098 non-null float64 65 male_age_mean 27132 non-null float64 66 male_age_median 27132 non-null float64 67 male_age_stdev 27132 non-null float64 68 male_age_sample_weight 27132 non-null float64 69 male_age_samples 27132 non-null float64 70 female_age_mean 27115 non-null float64 71 female_age_median 27115 non-null float64 72 female_age_stdev 27115 non-null float64 73 female_age_sample_weight 27115 non-null float64 74 female_age_samples 27115 non-null float64 75 pct_own 27053 non-null float64 76 married 27130 non-null float64 77 married_snp 27130 non-null float64 78 separated 27130 non-null float64 79 divorced 27130 non-null float64 dtypes: float64(62), int64(12), object(6) memory usage: 16.7+ MB
df_train.columns
Index(['UID', 'BLOCKID', 'SUMLEVEL', 'COUNTYID', 'STATEID', 'state',
'state_ab', 'city', 'place', 'type', 'primary', 'zip_code', 'area_code',
'lat', 'lng', 'ALand', 'AWater', 'pop', 'male_pop', 'female_pop',
'rent_mean', 'rent_median', 'rent_stdev', 'rent_sample_weight',
'rent_samples', 'rent_gt_10', 'rent_gt_15', 'rent_gt_20', 'rent_gt_25',
'rent_gt_30', 'rent_gt_35', 'rent_gt_40', 'rent_gt_50',
'universe_samples', 'used_samples', 'hi_mean', 'hi_median', 'hi_stdev',
'hi_sample_weight', 'hi_samples', 'family_mean', 'family_median',
'family_stdev', 'family_sample_weight', 'family_samples',
'hc_mortgage_mean', 'hc_mortgage_median', 'hc_mortgage_stdev',
'hc_mortgage_sample_weight', 'hc_mortgage_samples', 'hc_mean',
'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight',
'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt',
'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree',
'hs_degree_male', 'hs_degree_female', 'male_age_mean',
'male_age_median', 'male_age_stdev', 'male_age_sample_weight',
'male_age_samples', 'female_age_mean', 'female_age_median',
'female_age_stdev', 'female_age_sample_weight', 'female_age_samples',
'pct_own', 'married', 'married_snp', 'separated', 'divorced'],
dtype='object')
df_test.columns
Index(['UID', 'BLOCKID', 'SUMLEVEL', 'COUNTYID', 'STATEID', 'state',
'state_ab', 'city', 'place', 'type', 'primary', 'zip_code', 'area_code',
'lat', 'lng', 'ALand', 'AWater', 'pop', 'male_pop', 'female_pop',
'rent_mean', 'rent_median', 'rent_stdev', 'rent_sample_weight',
'rent_samples', 'rent_gt_10', 'rent_gt_15', 'rent_gt_20', 'rent_gt_25',
'rent_gt_30', 'rent_gt_35', 'rent_gt_40', 'rent_gt_50',
'universe_samples', 'used_samples', 'hi_mean', 'hi_median', 'hi_stdev',
'hi_sample_weight', 'hi_samples', 'family_mean', 'family_median',
'family_stdev', 'family_sample_weight', 'family_samples',
'hc_mortgage_mean', 'hc_mortgage_median', 'hc_mortgage_stdev',
'hc_mortgage_sample_weight', 'hc_mortgage_samples', 'hc_mean',
'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight',
'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt',
'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree',
'hs_degree_male', 'hs_degree_female', 'male_age_mean',
'male_age_median', 'male_age_stdev', 'male_age_sample_weight',
'male_age_samples', 'female_age_mean', 'female_age_median',
'female_age_stdev', 'female_age_sample_weight', 'female_age_samples',
'pct_own', 'married', 'married_snp', 'separated', 'divorced'],
dtype='object')
df_test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11709 entries, 0 to 11708 Data columns (total 80 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 UID 11709 non-null int64 1 BLOCKID 0 non-null float64 2 SUMLEVEL 11709 non-null int64 3 COUNTYID 11709 non-null int64 4 STATEID 11709 non-null int64 5 state 11709 non-null object 6 state_ab 11709 non-null object 7 city 11709 non-null object 8 place 11709 non-null object 9 type 11709 non-null object 10 primary 11709 non-null object 11 zip_code 11709 non-null int64 12 area_code 11709 non-null int64 13 lat 11709 non-null float64 14 lng 11709 non-null float64 15 ALand 11709 non-null int64 16 AWater 11709 non-null int64 17 pop 11709 non-null int64 18 male_pop 11709 non-null int64 19 female_pop 11709 non-null int64 20 rent_mean 11561 non-null float64 21 rent_median 11561 non-null float64 22 rent_stdev 11561 non-null float64 23 rent_sample_weight 11561 non-null float64 24 rent_samples 11561 non-null float64 25 rent_gt_10 11560 non-null float64 26 rent_gt_15 11560 non-null float64 27 rent_gt_20 11560 non-null float64 28 rent_gt_25 11560 non-null float64 29 rent_gt_30 11560 non-null float64 30 rent_gt_35 11560 non-null float64 31 rent_gt_40 11560 non-null float64 32 rent_gt_50 11560 non-null float64 33 universe_samples 11709 non-null int64 34 used_samples 11709 non-null int64 35 hi_mean 11587 non-null float64 36 hi_median 11587 non-null float64 37 hi_stdev 11587 non-null float64 38 hi_sample_weight 11587 non-null float64 39 hi_samples 11587 non-null float64 40 family_mean 11573 non-null float64 41 family_median 11573 non-null float64 42 family_stdev 11573 non-null float64 43 family_sample_weight 11573 non-null float64 44 family_samples 11573 non-null float64 45 hc_mortgage_mean 11441 non-null float64 46 hc_mortgage_median 11441 non-null float64 47 hc_mortgage_stdev 11441 non-null float64 48 hc_mortgage_sample_weight 11441 non-null float64 49 hc_mortgage_samples 11441 non-null float64 50 hc_mean 11419 non-null float64 51 hc_median 11419 non-null float64 52 hc_stdev 11419 non-null float64 53 hc_samples 11419 non-null float64 54 hc_sample_weight 11419 non-null float64 55 home_equity_second_mortgage 11489 non-null float64 56 second_mortgage 11489 non-null float64 57 home_equity 11489 non-null float64 58 debt 11489 non-null float64 59 second_mortgage_cdf 11489 non-null float64 60 home_equity_cdf 11489 non-null float64 61 debt_cdf 11489 non-null float64 62 hs_degree 11624 non-null float64 63 hs_degree_male 11620 non-null float64 64 hs_degree_female 11604 non-null float64 65 male_age_mean 11625 non-null float64 66 male_age_median 11625 non-null float64 67 male_age_stdev 11625 non-null float64 68 male_age_sample_weight 11625 non-null float64 69 male_age_samples 11625 non-null float64 70 female_age_mean 11613 non-null float64 71 female_age_median 11613 non-null float64 72 female_age_stdev 11613 non-null float64 73 female_age_sample_weight 11613 non-null float64 74 female_age_samples 11613 non-null float64 75 pct_own 11587 non-null float64 76 married 11625 non-null float64 77 married_snp 11625 non-null float64 78 separated 11625 non-null float64 79 divorced 11625 non-null float64 dtypes: float64(61), int64(13), object(6) memory usage: 7.1+ MB
df_test.columns.value_counts().sum()
80
df_train.columns.value_counts().sum()
80
df_test.isnull().sum()
UID 0
BLOCKID 11709
SUMLEVEL 0
COUNTYID 0
STATEID 0
...
pct_own 122
married 84
married_snp 84
separated 84
divorced 84
Length: 80, dtype: int64
df_train.isnull().sum()
UID 0
BLOCKID 27321
SUMLEVEL 0
COUNTYID 0
STATEID 0
...
pct_own 268
married 191
married_snp 191
separated 191
divorced 191
Length: 80, dtype: int64
df_test.describe()
| UID | BLOCKID | SUMLEVEL | COUNTYID | STATEID | zip_code | area_code | lat | lng | ALand | ... | female_age_mean | female_age_median | female_age_stdev | female_age_sample_weight | female_age_samples | pct_own | married | married_snp | separated | divorced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 11709.000000 | 0.0 | 11709.0 | 11709.000000 | 11709.000000 | 11709.000000 | 11709.000000 | 11709.000000 | 11709.000000 | 1.170900e+04 | ... | 11613.000000 | 11613.000000 | 11613.000000 | 11613.000000 | 11613.000000 | 11587.000000 | 11625.000000 | 11625.000000 | 11625.000000 | 11625.000000 |
| mean | 257525.004783 | NaN | 140.0 | 85.710650 | 28.489196 | 50123.418396 | 593.598514 | 37.405491 | -91.340229 | 1.095500e+08 | ... | 40.111999 | 40.131864 | 22.148145 | 550.411243 | 2233.003186 | 0.634194 | 0.505632 | 0.047960 | 0.019346 | 0.099191 |
| std | 21466.372658 | NaN | 0.0 | 99.304334 | 16.607262 | 29775.134038 | 232.074263 | 5.625904 | 16.407818 | 7.624940e+08 | ... | 5.851192 | 7.972026 | 2.554907 | 280.992521 | 1072.017063 | 0.232232 | 0.139774 | 0.038693 | 0.021428 | 0.048525 |
| min | 220336.000000 | NaN | 140.0 | 1.000000 | 1.000000 | 601.000000 | 201.000000 | 17.965835 | -166.770979 | 8.299000e+03 | ... | 15.360240 | 12.833330 | 0.737110 | 0.251910 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 238819.000000 | NaN | 140.0 | 29.000000 | 13.000000 | 25570.000000 | 404.000000 | 33.919813 | -97.816561 | 1.718660e+06 | ... | 36.729210 | 34.750000 | 21.270920 | 363.225840 | 1499.000000 | 0.492500 | 0.422020 | 0.020890 | 0.004500 | 0.064590 |
| 50% | 257651.000000 | NaN | 140.0 | 61.000000 | 28.000000 | 47362.000000 | 612.000000 | 38.618093 | -86.643344 | 4.835000e+06 | ... | 40.196960 | 40.333330 | 22.472990 | 509.103610 | 2099.000000 | 0.687640 | 0.525270 | 0.038680 | 0.013870 | 0.094350 |
| 75% | 276300.000000 | NaN | 140.0 | 109.000000 | 42.000000 | 77406.000000 | 787.000000 | 41.232973 | -79.697311 | 3.204540e+07 | ... | 43.496490 | 45.333330 | 23.549450 | 685.883910 | 2800.000000 | 0.815235 | 0.605660 | 0.065340 | 0.027910 | 0.128400 |
| max | 294333.000000 | NaN | 140.0 | 810.000000 | 72.000000 | 99929.000000 | 989.000000 | 64.804269 | -65.695344 | 5.520166e+10 | ... | 90.107940 | 90.166670 | 29.626680 | 4145.557870 | 15466.000000 | 1.000000 | 1.000000 | 0.714290 | 0.714290 | 0.362750 |
8 rows × 74 columns
df_train.describe()
| UID | BLOCKID | SUMLEVEL | COUNTYID | STATEID | zip_code | area_code | lat | lng | ALand | ... | female_age_mean | female_age_median | female_age_stdev | female_age_sample_weight | female_age_samples | pct_own | married | married_snp | separated | divorced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 27321.000000 | 0.0 | 27321.0 | 27321.000000 | 27321.000000 | 27321.000000 | 27321.000000 | 27321.000000 | 27321.000000 | 2.732100e+04 | ... | 27115.000000 | 27115.000000 | 27115.000000 | 27115.000000 | 27115.000000 | 27053.000000 | 27130.000000 | 27130.000000 | 27130.000000 | 27130.000000 |
| mean | 257331.996303 | NaN | 140.0 | 85.646426 | 28.271806 | 50081.999524 | 596.507668 | 37.508813 | -91.288394 | 1.295106e+08 | ... | 40.319803 | 40.355099 | 22.178745 | 544.238432 | 2208.761903 | 0.640434 | 0.508300 | 0.047537 | 0.019089 | 0.100248 |
| std | 21343.859725 | NaN | 0.0 | 98.333097 | 16.392846 | 29558.115660 | 232.497482 | 5.588268 | 16.343816 | 1.275531e+09 | ... | 5.886317 | 8.039585 | 2.540257 | 283.546896 | 1089.316999 | 0.226640 | 0.136860 | 0.037640 | 0.020796 | 0.049055 |
| min | 220342.000000 | NaN | 140.0 | 1.000000 | 1.000000 | 602.000000 | 201.000000 | 17.929085 | -165.453872 | 4.113400e+04 | ... | 16.008330 | 13.250000 | 0.556780 | 0.664700 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 238816.000000 | NaN | 140.0 | 29.000000 | 13.000000 | 26554.000000 | 405.000000 | 33.899064 | -97.816067 | 1.799408e+06 | ... | 36.892050 | 34.916670 | 21.312135 | 355.995825 | 1471.000000 | 0.502780 | 0.425102 | 0.020810 | 0.004530 | 0.065800 |
| 50% | 257220.000000 | NaN | 140.0 | 63.000000 | 28.000000 | 47715.000000 | 614.000000 | 38.755183 | -86.554374 | 4.866940e+06 | ... | 40.373320 | 40.583330 | 22.514410 | 503.643890 | 2066.000000 | 0.690840 | 0.526665 | 0.038840 | 0.013460 | 0.095205 |
| 75% | 275818.000000 | NaN | 140.0 | 109.000000 | 42.000000 | 77093.000000 | 801.000000 | 41.380606 | -79.782503 | 3.359820e+07 | ... | 43.567120 | 45.416670 | 23.575260 | 680.275055 | 2772.000000 | 0.817460 | 0.605760 | 0.065100 | 0.027488 | 0.129000 |
| max | 294334.000000 | NaN | 140.0 | 840.000000 | 72.000000 | 99925.000000 | 989.000000 | 67.074017 | -65.379332 | 1.039510e+11 | ... | 79.837390 | 82.250000 | 30.241270 | 6197.995200 | 27250.000000 | 1.000000 | 1.000000 | 0.714290 | 0.714290 | 1.000000 |
8 rows × 74 columns
df_test.set_index(keys=['UID'], inplace=True)
df_train.set_index(keys=['UID'], inplace=True)
df_test.head(5)
| BLOCKID | SUMLEVEL | COUNTYID | STATEID | state | state_ab | city | place | type | primary | ... | female_age_mean | female_age_median | female_age_stdev | female_age_sample_weight | female_age_samples | pct_own | married | married_snp | separated | divorced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| UID | |||||||||||||||||||||
| 255504 | NaN | 140 | 163 | 26 | Michigan | MI | Detroit | Dearborn Heights City | CDP | tract | ... | 34.78682 | 33.75000 | 21.58531 | 416.48097 | 1938.0 | 0.70252 | 0.28217 | 0.05910 | 0.03813 | 0.14299 |
| 252676 | NaN | 140 | 1 | 23 | Maine | ME | Auburn | Auburn City | City | tract | ... | 44.23451 | 46.66667 | 22.37036 | 532.03505 | 1950.0 | 0.85128 | 0.64221 | 0.02338 | 0.00000 | 0.13377 |
| 276314 | NaN | 140 | 15 | 42 | Pennsylvania | PA | Pine City | Millerton | Borough | tract | ... | 41.62426 | 44.50000 | 22.86213 | 453.11959 | 1879.0 | 0.81897 | 0.59961 | 0.01746 | 0.01358 | 0.10026 |
| 248614 | NaN | 140 | 231 | 21 | Kentucky | KY | Monticello | Monticello City | City | tract | ... | 44.81200 | 48.00000 | 21.03155 | 263.94320 | 1081.0 | 0.84609 | 0.56953 | 0.05492 | 0.04694 | 0.12489 |
| 286865 | NaN | 140 | 355 | 48 | Texas | TX | Corpus Christi | Edroy | Town | tract | ... | 40.66618 | 42.66667 | 21.30900 | 709.90829 | 2956.0 | 0.79077 | 0.57620 | 0.01726 | 0.00588 | 0.16379 |
5 rows × 79 columns
df_train.head(5)
| BLOCKID | SUMLEVEL | COUNTYID | STATEID | state | state_ab | city | place | type | primary | ... | female_age_mean | female_age_median | female_age_stdev | female_age_sample_weight | female_age_samples | pct_own | married | married_snp | separated | divorced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| UID | |||||||||||||||||||||
| 267822 | NaN | 140 | 53 | 36 | New York | NY | Hamilton | Hamilton | City | tract | ... | 44.48629 | 45.33333 | 22.51276 | 685.33845 | 2618.0 | 0.79046 | 0.57851 | 0.01882 | 0.01240 | 0.08770 |
| 246444 | NaN | 140 | 141 | 18 | Indiana | IN | South Bend | Roseland | City | tract | ... | 36.48391 | 37.58333 | 23.43353 | 267.23367 | 1284.0 | 0.52483 | 0.34886 | 0.01426 | 0.01426 | 0.09030 |
| 245683 | NaN | 140 | 63 | 18 | Indiana | IN | Danville | Danville | City | tract | ... | 42.15810 | 42.83333 | 23.94119 | 707.01963 | 3238.0 | 0.85331 | 0.64745 | 0.02830 | 0.01607 | 0.10657 |
| 279653 | NaN | 140 | 127 | 72 | Puerto Rico | PR | San Juan | Guaynabo | Urban | tract | ... | 47.77526 | 50.58333 | 24.32015 | 362.20193 | 1559.0 | 0.65037 | 0.47257 | 0.02021 | 0.02021 | 0.10106 |
| 247218 | NaN | 140 | 161 | 20 | Kansas | KS | Manhattan | Manhattan City | City | tract | ... | 24.17693 | 21.58333 | 11.10484 | 1854.48652 | 3051.0 | 0.13046 | 0.12356 | 0.00000 | 0.00000 | 0.03109 |
5 rows × 79 columns
list_missing_df_train= df_train.isnull().sum()*100/len(df_train)
missing_values_df_train = pd.DataFrame(list_missing_df_train, columns= ['Percentage of missing values'])
missing_values_df_train.sort_values(by=['Percentage of missing values'], inplace = True , ascending=False)
missing_values_df_train[missing_values_df_train['Percentage of missing values']>0][:10]
| Percentage of missing values | |
|---|---|
| BLOCKID | 100.000000 |
| hc_samples | 2.196113 |
| hc_mean | 2.196113 |
| hc_median | 2.196113 |
| hc_stdev | 2.196113 |
| hc_sample_weight | 2.196113 |
| hc_mortgage_mean | 2.097288 |
| hc_mortgage_stdev | 2.097288 |
| hc_mortgage_sample_weight | 2.097288 |
| hc_mortgage_samples | 2.097288 |
list_missing_df_test= df_test.isnull().sum()*100/len(df_test)
missing_values_df_test = pd.DataFrame(list_missing_df_test, columns =['Percentage of missing values'])
missing_values_df_test.sort_values(by=['Percentage of missing values'], inplace=True, ascending=False)
missing_values_df_test[missing_values_df_test['Percentage of missing values']>0][:10]
| Percentage of missing values | |
|---|---|
| BLOCKID | 100.000000 |
| hc_samples | 2.476727 |
| hc_mean | 2.476727 |
| hc_median | 2.476727 |
| hc_stdev | 2.476727 |
| hc_sample_weight | 2.476727 |
| hc_mortgage_mean | 2.288838 |
| hc_mortgage_stdev | 2.288838 |
| hc_mortgage_sample_weight | 2.288838 |
| hc_mortgage_samples | 2.288838 |
df_train.drop(columns= ['BLOCKID', 'SUMLEVEL'], inplace=True)
df_test.drop(columns=['BLOCKID', 'SUMLEVEL'], inplace = True)
df_test.columns
Index(['COUNTYID', 'STATEID', 'state', 'state_ab', 'city', 'place', 'type',
'primary', 'zip_code', 'area_code', 'lat', 'lng', 'ALand', 'AWater',
'pop', 'male_pop', 'female_pop', 'rent_mean', 'rent_median',
'rent_stdev', 'rent_sample_weight', 'rent_samples', 'rent_gt_10',
'rent_gt_15', 'rent_gt_20', 'rent_gt_25', 'rent_gt_30', 'rent_gt_35',
'rent_gt_40', 'rent_gt_50', 'universe_samples', 'used_samples',
'hi_mean', 'hi_median', 'hi_stdev', 'hi_sample_weight', 'hi_samples',
'family_mean', 'family_median', 'family_stdev', 'family_sample_weight',
'family_samples', 'hc_mortgage_mean', 'hc_mortgage_median',
'hc_mortgage_stdev', 'hc_mortgage_sample_weight', 'hc_mortgage_samples',
'hc_mean', 'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight',
'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt',
'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree',
'hs_degree_male', 'hs_degree_female', 'male_age_mean',
'male_age_median', 'male_age_stdev', 'male_age_sample_weight',
'male_age_samples', 'female_age_mean', 'female_age_median',
'female_age_stdev', 'female_age_sample_weight', 'female_age_samples',
'pct_own', 'married', 'married_snp', 'separated', 'divorced'],
dtype='object')
missing_train_cols=[]
for col in df_train.columns:
if df_train[col].isna().sum() !=0:
missing_train_cols.append(col)
print(missing_train_cols)
['rent_mean', 'rent_median', 'rent_stdev', 'rent_sample_weight', 'rent_samples', 'rent_gt_10', 'rent_gt_15', 'rent_gt_20', 'rent_gt_25', 'rent_gt_30', 'rent_gt_35', 'rent_gt_40', 'rent_gt_50', 'hi_mean', 'hi_median', 'hi_stdev', 'hi_sample_weight', 'hi_samples', 'family_mean', 'family_median', 'family_stdev', 'family_sample_weight', 'family_samples', 'hc_mortgage_mean', 'hc_mortgage_median', 'hc_mortgage_stdev', 'hc_mortgage_sample_weight', 'hc_mortgage_samples', 'hc_mean', 'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight', 'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt', 'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree', 'hs_degree_male', 'hs_degree_female', 'male_age_mean', 'male_age_median', 'male_age_stdev', 'male_age_sample_weight', 'male_age_samples', 'female_age_mean', 'female_age_median', 'female_age_stdev', 'female_age_sample_weight', 'female_age_samples', 'pct_own', 'married', 'married_snp', 'separated', 'divorced']
missing_test_cols=[]
for col in df_test.columns:
if df_test[col].isna().sum() !=0:
missing_test_cols.append(col)
print(missing_test_cols)
['rent_mean', 'rent_median', 'rent_stdev', 'rent_sample_weight', 'rent_samples', 'rent_gt_10', 'rent_gt_15', 'rent_gt_20', 'rent_gt_25', 'rent_gt_30', 'rent_gt_35', 'rent_gt_40', 'rent_gt_50', 'hi_mean', 'hi_median', 'hi_stdev', 'hi_sample_weight', 'hi_samples', 'family_mean', 'family_median', 'family_stdev', 'family_sample_weight', 'family_samples', 'hc_mortgage_mean', 'hc_mortgage_median', 'hc_mortgage_stdev', 'hc_mortgage_sample_weight', 'hc_mortgage_samples', 'hc_mean', 'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight', 'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt', 'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree', 'hs_degree_male', 'hs_degree_female', 'male_age_mean', 'male_age_median', 'male_age_stdev', 'male_age_sample_weight', 'male_age_samples', 'female_age_mean', 'female_age_median', 'female_age_stdev', 'female_age_sample_weight', 'female_age_samples', 'pct_own', 'married', 'married_snp', 'separated', 'divorced']
for col in df_test.columns:
if col in (missing_test_cols):
df_test[col].replace(np.nan, df_test[col].mean(), inplace=True)
for col in df_train.columns:
if col in (missing_train_cols):
df_train[col].replace(np.nan, df_train[col].mean(), inplace = True)
df_test.isnull().sum()
COUNTYID 0
STATEID 0
state 0
state_ab 0
city 0
..
pct_own 0
married 0
married_snp 0
separated 0
divorced 0
Length: 77, dtype: int64
df_test.isnull().sum()
COUNTYID 0
STATEID 0
state 0
state_ab 0
city 0
..
pct_own 0
married 0
married_snp 0
separated 0
divorced 0
Length: 77, dtype: int64
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pandasql import sqldf
df_test.columns
Index(['COUNTYID', 'STATEID', 'state', 'state_ab', 'city', 'place', 'type',
'primary', 'zip_code', 'area_code', 'lat', 'lng', 'ALand', 'AWater',
'pop', 'male_pop', 'female_pop', 'rent_mean', 'rent_median',
'rent_stdev', 'rent_sample_weight', 'rent_samples', 'rent_gt_10',
'rent_gt_15', 'rent_gt_20', 'rent_gt_25', 'rent_gt_30', 'rent_gt_35',
'rent_gt_40', 'rent_gt_50', 'universe_samples', 'used_samples',
'hi_mean', 'hi_median', 'hi_stdev', 'hi_sample_weight', 'hi_samples',
'family_mean', 'family_median', 'family_stdev', 'family_sample_weight',
'family_samples', 'hc_mortgage_mean', 'hc_mortgage_median',
'hc_mortgage_stdev', 'hc_mortgage_sample_weight', 'hc_mortgage_samples',
'hc_mean', 'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight',
'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt',
'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree',
'hs_degree_male', 'hs_degree_female', 'male_age_mean',
'male_age_median', 'male_age_stdev', 'male_age_sample_weight',
'male_age_samples', 'female_age_mean', 'female_age_median',
'female_age_stdev', 'female_age_sample_weight', 'female_age_samples',
'pct_own', 'married', 'married_snp', 'separated', 'divorced'],
dtype='object')
Q1= "select place,pct_own,second_mortgage,lat,lng from df_train where pct_own >0.10 and second_mortgage <0.5 order by second_mortgage DESC LIMIT 2500;"
pysqldf = lambda q: sqldf(q, globals())
df_train_location_mort=pysqldf(Q1)
len(df_train_location_mort)
2500
#length = 2500
df_train_location_mort.head()
| place | pct_own | second_mortgage | lat | lng | |
|---|---|---|---|---|---|
| 0 | Worcester City | 0.20247 | 0.43363 | 42.254262 | -71.800347 |
| 1 | Harbor Hills | 0.15618 | 0.31818 | 40.751809 | -73.853582 |
| 2 | Glen Burnie | 0.22380 | 0.30212 | 39.127273 | -76.635265 |
| 3 | Egypt Lake-leto | 0.11618 | 0.28972 | 28.029063 | -82.495395 |
| 4 | Lincolnwood | 0.14228 | 0.28899 | 41.967289 | -87.652434 |
fig = go.Figure(data=go.Scattergeo(
lat = df_train_location_mort['lat'],
lon = df_train_location_mort['lng']),
)
fig.update_layout(
geo=dict(
scope = 'north america',
showland = True,
landcolor = "rgb(212, 212, 212)",
subunitcolor = "rgb(255, 255, 255)",
countrycolor = "rgb(255, 255, 255)",
showlakes = True,
lakecolor = "rgb(255, 255, 255)",
showsubunits = True,
showcountries = True,
resolution = 50,
projection = dict(
type = 'conic conformal',
rotation_lon = -100
),
lonaxis = dict(
showgrid = True,
gridwidth = 0.5,
range= [ -140.0, -55.0 ],
dtick = 5
),
lataxis = dict (
showgrid = True,
gridwidth = 0.5,
range= [ 20.0, 60.0 ],
dtick = 5
)
),
title='Top 2,500 locations with second mortgage is the highest and percent ownership is above 10 percent')
fig.show()
df_train['bad_debt']= df_train['second_mortgage']+ df_train['home_equity']-df_train['home_equity_second_mortgage']
df_train.columns
Index(['COUNTYID', 'STATEID', 'state', 'state_ab', 'city', 'place', 'type',
'primary', 'zip_code', 'area_code', 'lat', 'lng', 'ALand', 'AWater',
'pop', 'male_pop', 'female_pop', 'rent_mean', 'rent_median',
'rent_stdev', 'rent_sample_weight', 'rent_samples', 'rent_gt_10',
'rent_gt_15', 'rent_gt_20', 'rent_gt_25', 'rent_gt_30', 'rent_gt_35',
'rent_gt_40', 'rent_gt_50', 'universe_samples', 'used_samples',
'hi_mean', 'hi_median', 'hi_stdev', 'hi_sample_weight', 'hi_samples',
'family_mean', 'family_median', 'family_stdev', 'family_sample_weight',
'family_samples', 'hc_mortgage_mean', 'hc_mortgage_median',
'hc_mortgage_stdev', 'hc_mortgage_sample_weight', 'hc_mortgage_samples',
'hc_mean', 'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight',
'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt',
'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree',
'hs_degree_male', 'hs_degree_female', 'male_age_mean',
'male_age_median', 'male_age_stdev', 'male_age_sample_weight',
'male_age_samples', 'female_age_mean', 'female_age_median',
'female_age_stdev', 'female_age_sample_weight', 'female_age_samples',
'pct_own', 'married', 'married_snp', 'separated', 'divorced',
'bad_debt'],
dtype='object')
df_train['bins']= pd.cut(df_train['bad_debt'], bins = [0,0.10,1], labels = ["less than 50%" , "50-100%"])
df_train.groupby(['bins']).size().plot(kind='pie', subplots=True, startangle = 90, autopct='%1.1f%%')
plt.axis('equal')
plt.show()
cols = []
df_train.columns
Index(['COUNTYID', 'STATEID', 'state', 'state_ab', 'city', 'place', 'type',
'primary', 'zip_code', 'area_code', 'lat', 'lng', 'ALand', 'AWater',
'pop', 'male_pop', 'female_pop', 'rent_mean', 'rent_median',
'rent_stdev', 'rent_sample_weight', 'rent_samples', 'rent_gt_10',
'rent_gt_15', 'rent_gt_20', 'rent_gt_25', 'rent_gt_30', 'rent_gt_35',
'rent_gt_40', 'rent_gt_50', 'universe_samples', 'used_samples',
'hi_mean', 'hi_median', 'hi_stdev', 'hi_sample_weight', 'hi_samples',
'family_mean', 'family_median', 'family_stdev', 'family_sample_weight',
'family_samples', 'hc_mortgage_mean', 'hc_mortgage_median',
'hc_mortgage_stdev', 'hc_mortgage_sample_weight', 'hc_mortgage_samples',
'hc_mean', 'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight',
'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt',
'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree',
'hs_degree_male', 'hs_degree_female', 'male_age_mean',
'male_age_median', 'male_age_stdev', 'male_age_sample_weight',
'male_age_samples', 'female_age_mean', 'female_age_median',
'female_age_stdev', 'female_age_sample_weight', 'female_age_samples',
'pct_own', 'married', 'married_snp', 'separated', 'divorced',
'bad_debt', 'bins'],
dtype='object')
cols=['second_mortgage','home_equity','debt','bad_debt']
df_box_hamilton=df_train.loc[df_train['city'] == 'Hamilton']
df_box_manhattan=df_train.loc[df_train['city'] == 'Manhattan']
df_box_city=pd.concat([df_box_hamilton,df_box_manhattan])
df_box_city.head(4)
| COUNTYID | STATEID | state | state_ab | city | place | type | primary | zip_code | area_code | ... | female_age_stdev | female_age_sample_weight | female_age_samples | pct_own | married | married_snp | separated | divorced | bad_debt | bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| UID | |||||||||||||||||||||
| 267822 | 53 | 36 | New York | NY | Hamilton | Hamilton | City | tract | 13346 | 315 | ... | 22.51276 | 685.33845 | 2618.0 | 0.79046 | 0.57851 | 0.01882 | 0.01240 | 0.08770 | 0.09408 | less than 50% |
| 263797 | 21 | 34 | New Jersey | NJ | Hamilton | Yardville | City | tract | 8610 | 609 | ... | 24.05831 | 732.58443 | 3124.0 | 0.64400 | 0.56377 | 0.01980 | 0.00990 | 0.04892 | 0.18071 | 50-100% |
| 270979 | 17 | 39 | Ohio | OH | Hamilton | Hamilton City | Village | tract | 45015 | 513 | ... | 22.66500 | 565.32725 | 2528.0 | 0.61278 | 0.47397 | 0.04419 | 0.02663 | 0.13741 | 0.15005 | 50-100% |
| 259028 | 95 | 28 | Mississippi | MS | Hamilton | Hamilton | CDP | tract | 39746 | 662 | ... | 22.79602 | 483.01311 | 1954.0 | 0.83241 | 0.58678 | 0.01052 | 0.00000 | 0.11721 | 0.02130 | less than 50% |
4 rows × 79 columns
plt.figure(figsize=(10,5))
sns.boxplot(data=df_box_city , x='second_mortgage', y='city', width = 0.6, palette = "Set3")
plt.show()
plt.figure(figsize=(10,5))
sns.boxplot(data=df_box_city,x='home_equity', y='city',width=0.5,palette="Set3")
plt.show()
plt.figure(figsize=(10,5))
sns.boxplot(data=df_box_city,x='debt', y='city',width=0.5,palette="Set3")
plt.show()
plt.figure(figsize=(10,5))
sns.boxplot(data=df_box_city,x='bad_debt', y='city',width=0.5,palette="Set3")
plt.show()
Create a collated income distribution chart for family income, house hold income, and remaining income
sns.distplot(df_train['hi_mean'])
plt.title('Household income distribution chart')
plt.show()
C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
sns.distplot(df_train['family_mean'])
plt.title('Family income distribution chart')
plt.show()
C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
if 'family_mean' in df_train.columns:
print("Yes")
else:
print("No")
Yes
sns.distplot(df_train['family_mean']- df_train['hi_mean'])
plt.title('Reasoning income distribution chart ')
plt.show()
C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
Perform EDA and come out with insights into population density and age. You may have to derive new fields (make sure to weight averages for accurate measurements):
fig, (ax1, ax2, ax3)= plt.subplots(3,1)
sns.distplot(df_train['pop'], ax= ax1)
sns.distplot(df_train['male_pop'], ax=ax2)
sns.distplot(df_train['female_pop'], ax=ax3)
plt.subplots_adjust(wspace = 0.8 , hspace = 0.8)
plt.tight_layout()
plt.show()
C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
import warnings
warnings.filterwarnings("default")
fig,(ax1, ax2)= plt.subplots(2,1)
sns.distplot(df_train['male_age_mean'], ax= ax1)
sns.distplot(df_train['female_age_mean'], ax=ax2)
plt.subplots_adjust(wspace=0.8, hspace = 0.8)
plt.tight_layout()
plt.show()
C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
a) Use pop and ALand variables to create a new field called population density
df_train['pop_density']= df_train['pop']/df_train['ALand']
df_test['pop_density']= df_test['pop']/df_test['ALand']
sns.distplot(df_train['pop_density'])
plt.title('population Density')
plt.show()
C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
Use male_age_median, female_age_median, male_pop, and female_pop to create a new field called median age c) Visualize the findings using appropriate chart type
df_train['age_median']= (df_train['male_age_median']+ df_train['female_age_median'])/2
df_test['age_median']= (df_test['male_age_median']+ df_test['female_age_median'])/2
df_test[['male_age_median', 'female_age_median', 'male_pop', 'female_pop','age_median']].head()
| male_age_median | female_age_median | male_pop | female_pop | age_median | |
|---|---|---|---|---|---|
| UID | |||||
| 255504 | 27.83333 | 33.75000 | 1479 | 1938 | 30.791665 |
| 252676 | 46.08333 | 46.66667 | 1846 | 1950 | 46.375000 |
| 276314 | 41.91667 | 44.50000 | 2065 | 1879 | 43.208335 |
| 248614 | 43.00000 | 48.00000 | 1427 | 1081 | 45.500000 |
| 286865 | 43.75000 | 42.66667 | 3274 | 2956 | 43.208335 |
sns.distplot(df_train['age_median'])
plt.title('Median Age')
plt.show()
C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
sns.boxplot(df_train['age_median'])
plt.title('Population Density')
plt.show()
C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
Create bins for population into a new variable by selecting appropriate class interval so that the number of categories don’t exceed 5 for the ease of analysis
df_train['pop'].describe()
count 27321.000000 mean 4316.032685 std 2169.226173 min 0.000000 25% 2885.000000 50% 4042.000000 75% 5430.000000 max 53812.000000 Name: pop, dtype: float64
df_train['pop_bins']= pd.cut(df_train['pop'], bins=5, labels = ['very_low', 'low','medium','high','very high'])
df_train[['pop','pop_bins']]
| pop | pop_bins | |
|---|---|---|
| UID | ||
| 267822 | 5230 | very_low |
| 246444 | 2633 | very_low |
| 245683 | 6881 | very_low |
| 279653 | 2700 | very_low |
| 247218 | 5637 | very_low |
| ... | ... | ... |
| 279212 | 1847 | very_low |
| 277856 | 4155 | very_low |
| 233000 | 2829 | very_low |
| 287425 | 11542 | low |
| 265371 | 3726 | very_low |
27321 rows × 2 columns
df_train['pop_bins'].value_counts()
very_low 27058 low 246 medium 9 high 7 very high 1 Name: pop_bins, dtype: int64
Analyze the married, separated, and divorced population for these population brackets
df_train.groupby(by='pop_bins')[['married', 'separated', 'divorced']].count()
| married | separated | divorced | |
|---|---|---|---|
| pop_bins | |||
| very_low | 27058 | 27058 | 27058 |
| low | 246 | 246 | 246 |
| medium | 9 | 9 | 9 |
| high | 7 | 7 | 7 |
| very high | 1 | 1 | 1 |
df_train.groupby(by='pop_bins')[['married','separated','divorced']].agg(["mean", "median"])
| married | separated | divorced | ||||
|---|---|---|---|---|---|---|
| mean | median | mean | median | mean | median | |
| pop_bins | ||||||
| very_low | 0.507548 | 0.524680 | 0.019126 | 0.013650 | 0.100504 | 0.096020 |
| low | 0.584894 | 0.593135 | 0.015833 | 0.011195 | 0.075348 | 0.070045 |
| medium | 0.655737 | 0.618710 | 0.005003 | 0.004120 | 0.065927 | 0.064890 |
| high | 0.503359 | 0.335660 | 0.008141 | 0.002500 | 0.039030 | 0.010320 |
| very high | 0.734740 | 0.734740 | 0.004050 | 0.004050 | 0.030360 | 0.030360 |
plt.figure(figsize=(10,5))
pop_bin_married= df_train.groupby(by='pop_bins')[['married', 'separated', 'divorced']].agg(["mean"])
pop_bin_married.plot(figsize=(20,8))
plt.legend(loc='best')
plt.show()
<Figure size 720x360 with 0 Axes>
rent_state_mean= df_train.groupby(by='state')['rent_mean'].agg(["mean"])
rent_state_mean.head()
| mean | |
|---|---|
| state | |
| Alabama | 774.004927 |
| Alaska | 1185.763570 |
| Arizona | 1097.753511 |
| Arkansas | 720.918575 |
| California | 1471.133857 |
income_state_mean= df_train.groupby(by='state')['family_mean'].agg(["mean"])
income_state_mean.head()
| mean | |
|---|---|
| state | |
| Alabama | 67030.064213 |
| Alaska | 92136.545109 |
| Arizona | 73328.238798 |
| Arkansas | 64765.377850 |
| California | 87655.470820 |
rent_perc_of_income=rent_state_mean['mean']/income_state_mean['mean']
rent_perc_of_income.head(10)
state Alabama 0.011547 Alaska 0.012870 Arizona 0.014970 Arkansas 0.011131 California 0.016783 Colorado 0.013529 Connecticut 0.012637 Delaware 0.012929 District of Columbia 0.013198 Florida 0.015772 Name: mean, dtype: float64
sum(df_train['rent_mean'])/sum(df_train['family_mean'])
0.013358170721473864
Perform correlation analysis for all the relevant variables by creating a heatmap. Describe your findings.
df_train.columns
Index(['COUNTYID', 'STATEID', 'state', 'state_ab', 'city', 'place', 'type',
'primary', 'zip_code', 'area_code', 'lat', 'lng', 'ALand', 'AWater',
'pop', 'male_pop', 'female_pop', 'rent_mean', 'rent_median',
'rent_stdev', 'rent_sample_weight', 'rent_samples', 'rent_gt_10',
'rent_gt_15', 'rent_gt_20', 'rent_gt_25', 'rent_gt_30', 'rent_gt_35',
'rent_gt_40', 'rent_gt_50', 'universe_samples', 'used_samples',
'hi_mean', 'hi_median', 'hi_stdev', 'hi_sample_weight', 'hi_samples',
'family_mean', 'family_median', 'family_stdev', 'family_sample_weight',
'family_samples', 'hc_mortgage_mean', 'hc_mortgage_median',
'hc_mortgage_stdev', 'hc_mortgage_sample_weight', 'hc_mortgage_samples',
'hc_mean', 'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight',
'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt',
'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree',
'hs_degree_male', 'hs_degree_female', 'male_age_mean',
'male_age_median', 'male_age_stdev', 'male_age_sample_weight',
'male_age_samples', 'female_age_mean', 'female_age_median',
'female_age_stdev', 'female_age_sample_weight', 'female_age_samples',
'pct_own', 'married', 'married_snp', 'separated', 'divorced',
'bad_debt', 'bins', 'pop_density', 'age_median', 'pop_bins'],
dtype='object')
cor=df_train[['COUNTYID','STATEID','zip_code','type','pop', 'family_mean',
'second_mortgage', 'home_equity', 'debt','hs_degree',
'age_median','pct_own', 'married','separated', 'divorced']].corr()
plt.figure(figsize=(20,10))
sns.heatmap(cor, annot=True, cmap='coolwarm')
plt.show()
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer
fa = FactorAnalyzer(n_factors=5)
fa.fit_transform(df_train.select_dtypes(exclude = ('object', 'category')))
fa.loadings_
C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\factor_analyzer.py:368: DeprecationWarning: scipy.sum is deprecated and will be removed in SciPy 2.0.0, use numpy.sum instead C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\rotator.py:563: DeprecationWarning: scipy.diag is deprecated and will be removed in SciPy 2.0.0, use numpy.diag instead C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\rotator.py:565: DeprecationWarning: scipy.sqrt is deprecated and will be removed in SciPy 2.0.0, use numpy.lib.scimath.sqrt instead C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\rotator.py:579: DeprecationWarning: scipy.dot is deprecated and will be removed in SciPy 2.0.0, use numpy.dot instead C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\rotator.py:579: DeprecationWarning: scipy.diag is deprecated and will be removed in SciPy 2.0.0, use numpy.diag instead C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\rotator.py:584: DeprecationWarning: scipy.sqrt is deprecated and will be removed in SciPy 2.0.0, use numpy.lib.scimath.sqrt instead C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\rotator.py:584: DeprecationWarning: scipy.diag is deprecated and will be removed in SciPy 2.0.0, use numpy.diag instead C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\rotator.py:584: DeprecationWarning: scipy.dot is deprecated and will be removed in SciPy 2.0.0, use numpy.dot instead C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\rotator.py:585: DeprecationWarning: scipy.dot is deprecated and will be removed in SciPy 2.0.0, use numpy.dot instead C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\rotator.py:590: DeprecationWarning: scipy.sqrt is deprecated and will be removed in SciPy 2.0.0, use numpy.lib.scimath.sqrt instead C:\Users\bhumi\anaconda3\lib\site-packages\factor_analyzer\rotator.py:592: DeprecationWarning: scipy.dot is deprecated and will be removed in SciPy 2.0.0, use numpy.dot instead
array([[-1.12589166e-01, 1.95646468e-02, -2.39331083e-02,
-6.27632623e-02, 4.23474749e-02],
[-1.10186763e-01, 1.33506215e-02, 2.79651243e-02,
-1.49825864e-01, 1.10838807e-01],
[-8.28678646e-02, 5.16372377e-02, -1.36451871e-01,
-4.98918634e-02, -1.04024841e-01],
[ 1.80961149e-02, 1.92013753e-02, 5.81329827e-03,
2.64842740e-02, -6.12442486e-03],
[ 9.02324715e-02, -9.72544297e-02, -6.54601315e-02,
-1.33145899e-01, -1.48594601e-01],
[-1.07335697e-02, -4.12376818e-02, 1.45853484e-01,
8.80433327e-03, 1.08227565e-01],
[-4.28796971e-02, -2.09780214e-02, 3.66726851e-02,
-9.45597383e-02, 5.91380520e-02],
[-2.44243003e-03, -1.53245409e-02, -2.68300902e-03,
-4.52473044e-02, 2.37240659e-02],
[ 7.92164339e-02, 9.57453331e-01, -8.71151642e-02,
-6.59923845e-03, -3.97273184e-02],
[ 7.39808211e-02, 9.18750524e-01, -1.08834840e-01,
-2.79371590e-02, -3.93153640e-02],
[ 8.06598896e-02, 9.47839220e-01, -6.08006509e-02,
1.53627095e-02, -3.86977277e-02],
[ 7.70052137e-01, 9.84675329e-03, -3.71249754e-02,
1.14949046e-01, -1.23784684e-01],
[ 7.18615881e-01, 6.24980464e-03, -4.59787407e-02,
1.09109689e-01, -1.35301911e-01],
[ 7.07647246e-01, 2.46625399e-02, -1.00860846e-02,
1.04472488e-01, 7.72381251e-02],
[-1.34545492e-01, 3.36809297e-01, -4.87894959e-01,
-4.15446166e-02, 3.17608532e-01],
[ 2.31079697e-01, 4.37729787e-01, -6.40209196e-01,
-2.52310925e-02, 3.47216216e-01],
[-4.52068133e-02, 3.51263844e-02, 3.07537041e-02,
4.44793508e-01, -1.63273411e-01],
[-2.50717066e-02, 1.70166796e-02, 4.57227280e-02,
6.76083904e-01, -1.55256767e-01],
[-3.90694439e-02, -1.67460866e-02, 8.13962673e-02,
8.36389105e-01, -9.18259792e-02],
[-5.14161936e-02, -3.57207135e-02, 1.10795184e-01,
9.25123775e-01, -4.44866508e-02],
[-6.08589973e-02, -4.41860610e-02, 1.35794031e-01,
9.53019931e-01, -2.21548653e-02],
[-4.57771154e-02, -5.25526111e-02, 1.41019868e-01,
9.32702618e-01, -5.83072683e-07],
[-4.19486050e-02, -5.90387622e-02, 1.28851766e-01,
8.87316645e-01, 1.05894326e-02],
[-2.47894627e-02, -7.29670546e-02, 9.41510444e-02,
7.79023669e-01, 2.95352834e-02],
[ 2.12258459e-01, 4.65992346e-01, -6.14495951e-01,
-2.47660018e-02, 3.66644539e-01],
[ 2.33057249e-01, 4.47057850e-01, -6.28263427e-01,
-2.71547710e-02, 3.43419624e-01],
[ 7.85157098e-01, 4.91249258e-02, 1.44540484e-01,
-2.05217633e-01, -1.54523363e-01],
[ 7.10324880e-01, 4.99730440e-02, 1.32239990e-01,
-2.19171864e-01, -2.10505574e-01],
[ 8.61780953e-01, 4.35044836e-02, 1.65839099e-01,
-1.19850816e-01, 3.16733610e-02],
[-2.23443274e-01, 8.46259549e-01, -4.61177183e-02,
6.88599272e-02, 2.27742314e-01],
[ 1.43837557e-01, 9.53197424e-01, 2.27887469e-02,
-4.57890445e-02, 1.00796449e-01],
[ 8.30286496e-01, 3.42026003e-02, 1.61106001e-01,
-2.04570330e-01, -7.48710486e-02],
[ 7.94476585e-01, 2.83818596e-02, 1.51219548e-01,
-2.07681498e-01, -9.12497108e-02],
[ 8.11481669e-01, 4.32314899e-02, 1.43645563e-01,
-1.07778264e-01, 5.79540241e-02],
[-3.37741907e-01, 8.64927625e-01, 3.58933705e-02,
9.07183972e-02, 4.46327264e-02],
[ 5.03572654e-02, 9.35515351e-01, 1.51475403e-01,
-2.51501256e-02, -9.34471627e-02],
[ 9.78242247e-01, -3.31490292e-02, -1.05261173e-01,
4.50364254e-02, 7.37362061e-02],
[ 9.59137204e-01, -3.90023014e-02, -1.20630340e-01,
4.52591439e-02, 6.64877295e-02],
[ 8.14087167e-01, 2.23057235e-03, 7.66518536e-02,
2.02747428e-02, 1.27634815e-01],
[-4.15353984e-01, 7.18339585e-01, 3.40068065e-01,
-7.18402769e-02, -2.77950513e-01],
[ 7.64912662e-02, 7.24900629e-01, 2.74193203e-01,
-4.83952643e-02, -3.52988282e-01],
[ 9.10390833e-01, -5.36541214e-02, -4.68641819e-02,
-7.64183441e-04, 1.63870440e-01],
[ 8.73011872e-01, -5.30302307e-02, -5.89943125e-02,
-1.58989743e-03, 1.52417545e-01],
[ 7.55087663e-01, -3.56133824e-03, 5.39542590e-02,
4.24181436e-03, 2.58043475e-01],
[-1.23469882e-01, 6.07438109e-01, 6.33039195e-01,
-2.14798897e-02, 2.47973902e-01],
[-3.42866889e-01, 5.59526278e-01, 5.88213005e-01,
-2.51533548e-02, 2.18419885e-01],
[-1.60867206e-01, -1.53062590e-02, -1.57026584e-01,
1.09243754e-01, -6.61660805e-01],
[-1.37306764e-01, -2.17250646e-02, -1.58408933e-01,
1.25156195e-01, -6.71630806e-01],
[ 2.45096182e-01, -2.54584590e-02, -2.66691452e-02,
9.53148496e-02, -6.42510840e-01],
[ 2.03988656e-01, 7.85172835e-02, -3.01656228e-01,
2.28379491e-02, -6.29223365e-01],
[ 1.08926110e-01, -6.34332375e-02, -3.36565241e-02,
-9.49480582e-02, 6.81473893e-01],
[-2.63787624e-01, -6.43281163e-03, -3.58792147e-02,
-9.37962446e-02, 6.47816997e-01],
[-2.15717044e-01, -7.36588960e-02, 3.50113237e-01,
-1.95201626e-02, 6.36783769e-01],
[ 3.94306145e-01, 6.09565687e-02, 2.55337865e-01,
-2.20362099e-01, -1.84248084e-01],
[ 4.07877887e-01, 6.27256518e-02, 2.23926910e-01,
-2.10028737e-01, -1.71989227e-01],
[ 3.53156874e-01, 5.36715654e-02, 2.69603566e-01,
-2.16933217e-01, -1.80072068e-01],
[ 2.33537263e-01, -4.91732963e-02, 8.14450798e-01,
9.36688947e-02, 3.27131934e-01],
[ 2.40298212e-01, -3.38140094e-02, 8.31497001e-01,
7.52417674e-02, 2.46323616e-01],
[-6.71839510e-02, 6.58504550e-02, 5.86207693e-01,
8.72955244e-02, 9.12541350e-02],
[ 5.59835557e-02, 8.17918708e-01, -1.78458352e-01,
-1.55949438e-02, -3.34299731e-02],
[ 7.16426399e-02, 9.23428534e-01, -1.07142695e-01,
-2.78635371e-02, -4.35991120e-02],
[ 1.92496943e-01, -4.75870407e-02, 8.03173194e-01,
1.43492710e-01, 3.33862148e-01],
[ 1.87644429e-01, -3.29941023e-02, 8.58024491e-01,
1.31329954e-01, 2.55679719e-01],
[-1.02263658e-01, 6.03984260e-02, 4.72982256e-01,
7.36848384e-02, 1.12273907e-01],
[ 6.14776655e-02, 8.77962760e-01, -1.50410288e-01,
2.20991044e-02, -4.17158177e-02],
[ 7.83728218e-02, 9.54508791e-01, -5.91095909e-02,
1.64800936e-02, -4.32590999e-02],
[-3.24381907e-02, 1.11167165e-01, 7.84467399e-01,
-4.37718588e-02, -2.80931233e-01],
[ 1.76682389e-01, 1.90494237e-01, 5.61405482e-01,
-1.20746167e-01, -1.32570785e-01],
[-6.37386592e-02, -7.03047926e-02, -2.68934069e-01,
1.28589794e-01, 1.88507865e-01],
[-1.56051271e-01, -7.08033942e-02, -1.45964500e-01,
1.24253735e-01, 1.46293116e-01],
[-3.56716299e-01, -5.29910748e-02, 1.47771610e-01,
2.87196214e-02, 1.13159576e-01],
[ 2.42173821e-01, -2.86199139e-02, -3.25958384e-02,
1.05027822e-01, -6.55406092e-01],
[ 3.50196758e-01, -1.05016411e-02, -3.95274124e-01,
5.92876786e-02, 2.91651801e-01],
[ 2.25671546e-01, -3.42672751e-02, 8.92876642e-01,
1.12426818e-01, 2.67065205e-01]])
Data Modeling : Linear Regression
Build a linear Regression model to predict the total monthly expenditure for home mortgages loan. Please refer ‘deplotment_RE.xlsx’. Column hc_mortgage_mean is predicted variable. This is the mean monthly mortgage and owner costs of specified geographical location. Note: Exclude loans from prediction model which have NaN (Not a Number) values for hc_mortgage_mean.
df_train.columns
Index(['COUNTYID', 'STATEID', 'state', 'state_ab', 'city', 'place', 'type',
'primary', 'zip_code', 'area_code', 'lat', 'lng', 'ALand', 'AWater',
'pop', 'male_pop', 'female_pop', 'rent_mean', 'rent_median',
'rent_stdev', 'rent_sample_weight', 'rent_samples', 'rent_gt_10',
'rent_gt_15', 'rent_gt_20', 'rent_gt_25', 'rent_gt_30', 'rent_gt_35',
'rent_gt_40', 'rent_gt_50', 'universe_samples', 'used_samples',
'hi_mean', 'hi_median', 'hi_stdev', 'hi_sample_weight', 'hi_samples',
'family_mean', 'family_median', 'family_stdev', 'family_sample_weight',
'family_samples', 'hc_mortgage_mean', 'hc_mortgage_median',
'hc_mortgage_stdev', 'hc_mortgage_sample_weight', 'hc_mortgage_samples',
'hc_mean', 'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight',
'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt',
'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree',
'hs_degree_male', 'hs_degree_female', 'male_age_mean',
'male_age_median', 'male_age_stdev', 'male_age_sample_weight',
'male_age_samples', 'female_age_mean', 'female_age_median',
'female_age_stdev', 'female_age_sample_weight', 'female_age_samples',
'pct_own', 'married', 'married_snp', 'separated', 'divorced',
'bad_debt', 'bins', 'pop_density', 'age_median', 'pop_bins'],
dtype='object')
df_train['type'].unique()
type_dict = {'type':{'City':1,
'Urban':2,
'Town':3,
'CDP':4,
'Village':5,
'Borough':6}
}
df_train.replace(type_dict, inplace=True)
df_train['type'].unique()
array([1, 2, 3, 4, 5, 6], dtype=int64)
df_test.replace(type_dict, inplace=True)
df_test['type'].unique()
array([4, 1, 6, 3, 5, 2], dtype=int64)
len(df_test)
11709
feature_cols= ['COUNTYID', 'STATEID','zip_code','type','pop','family_mean',
'second_mortgage', 'home_equity','debt','hs_degree',
'age_median','pct_own','married','separated','divorced']
x_train = df_train[feature_cols]
y_train = df_train['hc_mortgage_mean']
x_test= df_test[feature_cols]
y_test = df_test['hc_mortgage_mean']
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score
x_train.head()
| COUNTYID | STATEID | zip_code | type | pop | family_mean | second_mortgage | home_equity | debt | hs_degree | age_median | pct_own | married | separated | divorced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| UID | |||||||||||||||
| 267822 | 53 | 36 | 13346 | 1 | 5230 | 67994.14790 | 0.02077 | 0.08919 | 0.52963 | 0.89288 | 44.666665 | 0.79046 | 0.57851 | 0.01240 | 0.08770 |
| 246444 | 141 | 18 | 46616 | 1 | 2633 | 50670.10337 | 0.02222 | 0.04274 | 0.60855 | 0.90487 | 34.791665 | 0.52483 | 0.34886 | 0.01426 | 0.09030 |
| 245683 | 63 | 18 | 46122 | 1 | 6881 | 95262.51431 | 0.00000 | 0.09512 | 0.73484 | 0.94288 | 41.833330 | 0.85331 | 0.64745 | 0.01607 | 0.10657 |
| 279653 | 127 | 72 | 927 | 2 | 2700 | 56401.68133 | 0.01086 | 0.01086 | 0.52714 | 0.91500 | 49.750000 | 0.65037 | 0.47257 | 0.02021 | 0.10106 |
| 247218 | 161 | 20 | 66502 | 1 | 5637 | 54053.42396 | 0.05426 | 0.05426 | 0.51938 | 1.00000 | 22.000000 | 0.13046 | 0.12356 | 0.00000 | 0.03109 |
x_test.head()
| COUNTYID | STATEID | zip_code | type | pop | family_mean | second_mortgage | home_equity | debt | hs_degree | age_median | pct_own | married | separated | divorced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| UID | |||||||||||||||
| 255504 | 163 | 26 | 48239 | 4 | 3417 | 53802.87122 | 0.06443 | 0.07651 | 0.63624 | 0.91047 | 30.791665 | 0.70252 | 0.28217 | 0.03813 | 0.14299 |
| 252676 | 1 | 23 | 4210 | 1 | 3796 | 85642.22095 | 0.01175 | 0.14375 | 0.64755 | 0.94290 | 46.375000 | 0.85128 | 0.64221 | 0.00000 | 0.13377 |
| 276314 | 15 | 42 | 14871 | 6 | 3944 | 65694.06582 | 0.01316 | 0.06497 | 0.45395 | 0.89238 | 43.208335 | 0.81897 | 0.59961 | 0.01358 | 0.10026 |
| 248614 | 231 | 21 | 42633 | 1 | 2508 | 44156.38709 | 0.00995 | 0.01741 | 0.41915 | 0.60908 | 45.500000 | 0.84609 | 0.56953 | 0.04694 | 0.12489 |
| 286865 | 355 | 48 | 78410 | 3 | 6230 | 123527.02420 | 0.00000 | 0.03440 | 0.63188 | 0.86297 | 43.208335 | 0.79077 | 0.57620 | 0.00588 | 0.16379 |
sc= StandardScaler()
x_train_scaled = sc.fit_transform(x_train)
x_test = x_test.astype(int)
x_test_scaled = sc.fit_transform(x_test)
x_test_scaled.dtype
dtype('float64')
x_test_scaled = sc.fit_transform(x_test)
print("Original array: \n", x_test)
Original array:
COUNTYID STATEID zip_code type pop family_mean second_mortgage \
UID
255504 163 26 48239 4 3417 53802 0
252676 1 23 4210 1 3796 85642 0
276314 15 42 14871 6 3944 65694 0
248614 231 21 42633 1 2508 44156 0
286865 355 48 78410 3 6230 123527 0
... ... ... ... ... ... ... ...
238088 105 12 33810 1 5611 70786 0
242811 31 17 60609 5 2695 38912 0
250127 9 25 1841 1 7392 99484 0
241096 27 19 51401 1 5945 75066 0
287763 453 48 78745 3 4117 54913 0
home_equity debt hs_degree age_median pct_own married separated \
UID
255504 0 0 0 30 0 0 0
252676 0 0 0 46 0 0 0
276314 0 0 0 43 0 0 0
248614 0 0 0 45 0 0 0
286865 0 0 0 43 0 0 0
... ... ... ... ... ... ... ...
238088 0 0 0 57 0 0 0
242811 0 0 0 31 0 0 0
250127 0 0 0 39 0 0 0
241096 0 0 0 44 0 0 0
287763 0 0 0 35 0 0 0
divorced
UID
255504 0
252676 0
276314 0
248614 0
286865 0
... ...
238088 0
242811 0
250127 0
241096 0
287763 0
[11709 rows x 15 columns]
# regression model framework
linreg= LinearRegression()
linreg.fit(x_train_scaled, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
y_pred = linreg.predict(x_test_scaled)
print("Overall R2 score of liear regression model" , r2_score(y_test, y_pred))
print("Overall RMSE of linear regression model ", np.sqrt(mean_squared_error(y_test, y_pred)))
Overall R2 score of liear regression model 0.36860812889746264 Overall RMSE of linear regression model 498.56236112379895
state=df_train['STATEID'].unique()
state[0:5]
array([36, 18, 72, 20, 1], dtype=int64)
for i in [20,1,45]:
print("State ID-",i)
x_train_nation=df_train[df_train['COUNTYID']==i][feature_cols]
y_train_nation=df_train[df_train['COUNTYID']==i]['hc_mortgage_mean']
x_test_nation=df_test[df_test['COUNTYID']==i][feature_cols]
y_test_nation=df_test[df_test['COUNTYID']==i]['hc_mortgage_mean']
x_train_scaled_nation=sc.fit_transform(x_train_nation)
x_test_scaled_nation=sc.fit_transform(x_test_nation)
linreg.fit(x_train_scaled_nation,y_train_nation)
y_pred_nation=linreg.predict(x_test_scaled_nation)
print("Overall R2 score of linear regression model for state,",i,":-" ,r2_score(y_test_nation,y_pred_nation))
print("Overall RMSE of linear regression model for state,",i,":-" ,np.sqrt(mean_squared_error(y_test_nation,y_pred_nation)))
print("\n")
State ID- 20 Overall R2 score of linear regression model for state, 20 :- 0.6046603766461807 Overall RMSE of linear regression model for state, 20 :- 307.97188999314733 State ID- 1 Overall R2 score of linear regression model for state, 1 :- 0.8104382475484616 Overall RMSE of linear regression model for state, 1 :- 307.8275861848435 State ID- 45 Overall R2 score of linear regression model for state, 45 :- 0.7887446497855253 Overall RMSE of linear regression model for state, 45 :- 225.69615420724128
import scipy.stats as stats
z_test = stats.zscore(x_test)
z_test
| COUNTYID | STATEID | zip_code | type | pop | family_mean | second_mortgage | home_equity | debt | hs_degree | age_median | pct_own | married | separated | divorced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| UID | |||||||||||||||
| 255504 | 0.778341 | -0.149892 | -0.063291 | 0.967961 | -0.447854 | -0.782680 | -0.01307 | -0.024458 | -0.077552 | -0.074134 | -1.131871 | -0.035815 | -0.020669 | NaN | NaN |
| 252676 | -0.853077 | -0.330544 | -1.542071 | -0.817367 | -0.269222 | 0.218849 | -0.01307 | -0.024458 | -0.077552 | -0.074134 | 0.997159 | -0.035815 | -0.020669 | NaN | NaN |
| 276314 | -0.712090 | 0.813583 | -1.184006 | 2.158179 | -0.199467 | -0.408617 | -0.01307 | -0.024458 | -0.077552 | -0.074134 | 0.597966 | -0.035815 | -0.020669 | NaN | NaN |
| 248614 | 1.463134 | -0.450978 | -0.251577 | -0.817367 | -0.876286 | -1.086096 | -0.01307 | -0.024458 | -0.077552 | -0.074134 | 0.864094 | -0.035815 | -0.020669 | NaN | NaN |
| 286865 | 2.711874 | 1.174886 | 0.950047 | 0.372852 | 0.877977 | 1.410524 | -0.01307 | -0.024458 | -0.077552 | -0.074134 | 0.597966 | -0.035815 | -0.020669 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 238088 | 0.194253 | -0.992933 | -0.547911 | -0.817367 | 0.586228 | -0.248447 | -0.01307 | -0.024458 | -0.077552 | -0.074134 | 2.460866 | -0.035815 | -0.020669 | NaN | NaN |
| 242811 | -0.550963 | -0.691847 | 0.352174 | 1.563070 | -0.788148 | -1.251046 | -0.01307 | -0.024458 | -0.077552 | -0.074134 | -0.998806 | -0.035815 | -0.020669 | NaN | NaN |
| 250127 | -0.772513 | -0.210110 | -1.621638 | -0.817367 | 1.425654 | 0.654250 | -0.01307 | -0.024458 | -0.077552 | -0.074134 | 0.065708 | -0.035815 | -0.020669 | NaN | NaN |
| 241096 | -0.591245 | -0.571413 | 0.042910 | -0.817367 | 0.743650 | -0.113820 | -0.01307 | -0.024458 | -0.077552 | -0.074134 | 0.731030 | -0.035815 | -0.020669 | NaN | NaN |
| 287763 | 3.698781 | 1.174886 | 0.961299 | 0.372852 | -0.117928 | -0.747734 | -0.01307 | -0.024458 | -0.077552 | -0.074134 | -0.466549 | -0.035815 | -0.020669 | NaN | NaN |
11709 rows × 15 columns
residuals = y_test-y_pred
residuals
UID
255504 93.706155
252676 -402.176763
276314 -116.294042
248614 -274.097891
286865 -354.341041
...
238088 -514.600736
242811 605.986970
250127 -314.053468
241096 -535.067349
287763 310.346698
Name: hc_mortgage_mean, Length: 11709, dtype: float64
plt.hist(residuals)
(array([3.000e+00, 1.000e+01, 6.026e+03, 5.377e+03, 2.650e+02, 1.100e+01,
4.000e+00, 9.000e+00, 2.000e+00, 2.000e+00]),
array([-3744.3506714 , -2517.34353672, -1290.33640205, -63.32926738,
1163.6778673 , 2390.68500197, 3617.69213665, 4844.69927132,
6071.70640599, 7298.71354067, 8525.72067534]),
<BarContainer object of 10 artists>)
(array([3.000e+00, 1.000e+01, 6.026e+03, 5.377e+03, 2.650e+02, 1.100e+01, 4.000e+00, 9.000e+00, 2.000e+00, 2.000e+00]), array([-3744.3506714 , -2517.34353672, -1290.33640205, -63.32926738, 1163.6778673 , 2390.68500197, 3617.69213665, 4844.69927132, 6071.70640599, 7298.71354067, 8525.72067534]),
sns.distplot(residuals)
C:\Users\bhumi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
<AxesSubplot:xlabel='hc_mortgage_mean', ylabel='Density'>
plt.scatter(residuals , y_pred)
<matplotlib.collections.PathCollection at 0x219df079a30>